import requests
from bs4 import BeautifulSoup
import csv
from time import sleep
from os import path


def get_data(url: str):
    """Scrape one Lagou listing page, persist the rows, and follow pagination.

    For ``url`` and every subsequent "next page" link, extracts the company
    name, salary range, job title, location, required experience, required
    education, industry field and company size for each listing, appends the
    rows to ``data_update.csv`` via :func:`write2csv`, then sleeps 15 s
    between pages to avoid hammering the server.

    Args:
        url: Absolute URL of a Lagou search-results page
            (e.g. ``https://www.lagou.com/beijing-zhaopin/Java/``).
    """
    header = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",

        # 'referer': 'https://www.lagou.com/beijing-zhaopin/Java/5/?filterOption=3&sid=06511852808247b880ece7e210cec930'
        'Cookie': 'Your cookie',
    }

    # Iterate pages with a loop instead of recursion: a category with many
    # result pages would otherwise risk hitting the recursion limit.
    while url:
        res = requests.get(url=url, headers=header)
        print(res.status_code)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Company name
        company_name = [i.get_text() for i in
                        soup.select('#s_position_list > ul > li div.list_item_top div.company_name > a')]
        # Salary range
        salary = [i.get_text() for i in
                  soup.select('#s_position_list > ul > li div.list_item_top div.position span.money')]
        # Job title
        job = [i.get_text() for i in
               soup.select('#s_position_list > ul > li div.list_item_top div.p_top > a > h3')]
        # Work location
        address = [i.get_text() for i in
                   soup.select('#s_position_list > ul > li div.list_item_top div.p_top > a >span > em')]
        # Experience and education both live in div.li_b_l as "exp / edu";
        # query the nodes once and split the text for both fields.
        li_b_l = soup.select('#s_position_list > ul > li div.list_item_top div.p_bot div.li_b_l')
        experience = [i.get_text().strip().split(' / ')[0].split("\n")[-1] for i in li_b_l]
        job_needed = [i.get_text().strip().split(' / ')[1] for i in li_b_l]
        # div.industry holds "field1 / field2 / size"; last segment is the
        # company size, the rest (re-joined) the industry field.
        industry = soup.select('#s_position_list > ul > li div.list_item_top div.company > div.industry')
        company_size = [i.get_text().split(' / ')[-1].strip() for i in industry]
        company_field = ['/'.join([p.strip() for p in i.get_text().split(' / ')[-2::-1]]) for i in industry]

        data = list(zip(company_name, salary, job, address, experience, job_needed, company_field, company_size))
        # Write every page unconditionally — previously the final page's rows
        # were dropped because the write happened only when a next page existed.
        write2csv(data)

        # The pager's last <a> is the "next page" link; on the final page its
        # href is a javascript: pseudo-link. Guard against a missing pager
        # (e.g. anti-bot page) instead of raising IndexError.
        pager = soup.select('a.page_no:last-child')
        next_page = pager[0].get('href') if pager else ''
        print(next_page)
        if not next_page or 'javascript' in next_page:
            break
        sleep(15)
        print('下载: ', next_page)
        url = next_page


def write2csv(data: list):
    """Append scraped rows to ``./data_update.csv``.

    Args:
        data: List of row sequences (one tuple/list of field strings per
            job listing), written as-is with ``csv.writer.writerows``.
    """
    # newline='' is required by the csv module docs: without it, Windows'
    # text-mode translation turns the writer's \r\n terminators into \r\r\n,
    # producing a blank line between every record.
    with open('./data_update.csv', 'a', encoding='utf-8', newline='') as wf:
        writer = csv.writer(wf)
        writer.writerows(data)


def main():
    """Crawl every job category linked from the Lagou home page.

    Fetches the home page, collects the category slug from each link in the
    category menu, then delegates each Beijing listing URL to
    :func:`get_data`.
    """
    home_url = "https://www.lagou.com/"
    request_headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
        # 'Cookie': '<paste your logged-in Lagou cookie here if required>',
    }
    session = requests.Session()
    response = session.get(url=home_url, headers=request_headers)
    page = BeautifulSoup(response.text, 'lxml')
    # Each category link ends in ".../<slug>/" — the slug is the second-to-last
    # path segment of the href.
    jobs = [link.get('href').split('/')[-2]
            for link in page.select('div.menu_box div.category-list a')]
    print(jobs)
    for job_slug in jobs:
        get_data(f"https://www.lagou.com/beijing-zhaopin/{job_slug}/")
        print(f"完成: {job_slug}岗位的爬取!!!")


if __name__ == '__main__':
    #  CSV header row: company name, salary range, job title, location,
    #  experience, education, industry field, company size
    headers = ['公司名称', '薪资区间', '招聘岗位', '工作地点', '工作经验', '学历要求', '行业领域', '公司规模']
    #  Create data_update.csv with the header row only on the first run, so
    #  re-running the script keeps appending to the existing file.
    if not path.exists('./data_update.csv'):
        # newline='' is required by the csv module; without it Windows emits
        # \r\r\n line endings (a blank line after the header).
        with open('./data_update.csv', 'w', encoding='utf-8', newline='') as wf:
            writer = csv.writer(wf)
            writer.writerow(headers)
    main()
